package WebFinder

import (
	"gitee.com/fierce_wolf/go-web-crawler/WebCrawler"
	"regexp"
)

// Address collects web elements whose text was recognized as a postal
// address. List is reset and repopulated by each call to Find.
type Address struct {
	List []*WebCrawler.WebElement // Elements related to home or office addresses; their Text field holds the text recognized as an address.
}

// GetList returns the elements currently stored in e.List. The slice is
// returned as-is (not copied), so it shares its backing array with e.List.
func (e *Address) GetList() []*WebCrawler.WebElement {
	return e.List
}

// minAddressParts is the number of distinct address components a text must
// contain before Find treats it as a candidate address.
const minAddressParts = 3

// Patterns used by Find. They are compiled once at package initialisation
// instead of on every call.
var (
	// findCityPattern matches a city-level name: 2-8 Han characters followed
	// by 市 or 盟.
	findCityPattern = regexp.MustCompile(`([\p{Han}]{2,8}(?:市|盟))`)

	// findProvincePattern matches a province-level name: 2-8 Han characters
	// followed by 省 or 自治区.
	findProvincePattern = regexp.MustCompile(`([\p{Han}]{2,8}(?:省|自治区))`)

	// findAddressFormat matches one formatted address. Segments, in order:
	//   1. optional province / autonomous region (省|自治区)
	//   2. mandatory city (市)
	//   3. optional district / county / banner (区|县|旗)
	//   4. optional street (街道|路|街|巷|道|弄)
	//   5. optional street number (N号)
	//   6. optional 2-8 Han characters, e.g. a residential compound name
	//   7. optional building (N栋|座|楼|号楼)
	//   8. optional room / unit (N室|号房|房间|房|单元)
	//
	// Go alternation is leftmost-first, so in segment 8 the longer suffix
	// 房间 is listed before 房. The original pattern also contained an empty
	// alternative ("房||号房"), which made every later suffix unreachable and
	// let bare digits be swallowed; it has been removed.
	//
	// NOTE(review): segment 2 accepts only 市, while findCityPattern also
	// accepts 盟. Texts that contain a 盟 but no 市 pass the earlier stages
	// and can never match here. Confirm whether that is intended.
	findAddressFormat = regexp.MustCompile(`([\p{Han}]{2,8}(?:省|自治区))?` +
		`([\p{Han}]{2,8}市)` +
		`([\p{Han}]{2,8}(?:区|县|旗))?` +
		`([\p{Han}]{2,8}(?:街道|路|街|巷|道|弄))?` +
		`(?:\d+号)?` +
		`([\p{Han}]{2,8})?` +
		`(?:\d+(?:栋|座|楼|号楼))?` +
		`(?:\d+(?:室|号房|房间|房|单元))?`)
)

// Find filters els down to the elements whose text contains exactly one
// formatted address.
//
// It resets e.List and then, for every accepted element, appends a clone whose
// Text is replaced by the matched address substring. The returned slice holds
// the corresponding unmodified original elements, in the same order.
func (e *Address) Find(els []*WebCrawler.WebElement) []*WebCrawler.WebElement {
	e.List = make([]*WebCrawler.WebElement, 0)

	// Stage 1: keep text elements that mention at least a city-level name.
	list := WebCrawler.Site.FindElements(els, WebCrawler.Filter{
		Mode:  WebCrawler.ModeRegexp,
		Param: findCityPattern,
	})

	// Stage 2: keep texts that contain enough distinct address components to
	// look like an address at all.
	list = e.matchAddress(list, minAddressParts)

	// Stage 3: keep text elements that also mention a province-level name.
	list = WebCrawler.RegExp.FindWebTexts(list, findProvincePattern)

	// Stage 4: run the full address format over each remaining text and
	// accept it only when exactly one address is found.
	originals := make([]*WebCrawler.WebElement, 0)
	for _, el := range list {
		matches := findAddressFormat.FindAllString(el.Text, -1) // -1: return every match
		if len(matches) != 1 {
			continue
		}

		// The untouched original goes to the caller.
		originals = append(originals, el)

		// A copy carrying only the address text is stored on the receiver.
		clone := el.Clone()
		clone.Text = matches[0]
		e.List = append(e.List, clone)
	}

	return originals
}

// matchAddress returns the elements of els whose Text scores at least
// threshold according to matchAddressValue. Input order is preserved and the
// result is always a non-nil slice.
func (e *Address) matchAddress(els []*WebCrawler.WebElement, threshold int) []*WebCrawler.WebElement {
	kept := make([]*WebCrawler.WebElement, 0)
	for i := range els {
		candidate := els[i]
		if e.matchAddressValue(candidate.Text) < threshold {
			// Too few address components: not address-like.
			continue
		}
		kept = append(kept, candidate)
	}

	return kept
}

// addressPartPatterns holds one precompiled pattern per address component that
// matchAddressValue scores. Compiling them once here avoids recompiling seven
// regular expressions for every scored text.
var addressPartPatterns = []*regexp.Regexp{
	regexp.MustCompile(`[\p{Han}]{2,8}(?:市|盟)`),             // city / league
	regexp.MustCompile(`[\p{Han}]{2,8}(?:省|自治区)`),           // province / autonomous region
	regexp.MustCompile(`[\p{Han}]{2,8}(?:区|县|旗)`),           // district / county / banner
	regexp.MustCompile(`[\p{Han}]{2,8}(?:街道|路|街|巷|道|弄)`), // street
	regexp.MustCompile(`\d+号`),                              // street number
	regexp.MustCompile(`\d+(?:栋|座|楼|号楼)`),                 // building
	// Room / unit. The original alternation contained an empty branch
	// ("房||号房"), which made a bare digit run count as a room, so any text
	// containing a number scored an extra point. The empty branch is removed.
	regexp.MustCompile(`\d+(?:室|号房|房间|房|单元)`),
}

// matchAddressValue returns how many distinct kinds of address component occur
// in address. Each kind (city, province, district, street, street number,
// building, room/unit) contributes at most 1, so the result is in the range
// 0..7. An empty string scores 0.
func (e *Address) matchAddressValue(address string) int {
	count := 0
	for _, re := range addressPartPatterns {
		if re.MatchString(address) {
			count++
		}
	}

	return count
}
